{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from os.path import dirname, join, realpath\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Project root: two directory levels above the current working directory.\n",
    "base_dir = dirname(dirname(os.getcwd()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/Users/haithamelmarakeby/PycharmProjects/pnet2'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SUC Fusions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the project root importable so that config_path can be resolved.\n",
    "# The membership check keeps re-runs of this cell from stacking duplicates.\n",
    "if base_dir not in sys.path:\n",
    "    sys.path.insert(0, base_dir)\n",
    "from config_path import PROSTATE_DATA_PATH, PLOTS_PATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "fusions_file = join(PROSTATE_DATA_PATH, 'raw_data/outputs_p1000_n=660_star_fusion.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index by the 'sample' column by name instead of by position; the leading\n",
    "# unnamed column in the file appears to be a saved row counter.\n",
    "fusions_data = pd.read_csv(fusions_file, sep='\\t', index_col='sample')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>seq_type</th>\n",
       "      <th>#FusionName</th>\n",
       "      <th>JunctionReadCount</th>\n",
       "      <th>SpanningFragCount</th>\n",
       "      <th>FFPM</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>MO_1008-Tumor_Dura</th>\n",
       "      <td>0</td>\n",
       "      <td>tcap</td>\n",
       "      <td>EIF4A2--ETV5</td>\n",
       "      <td>137.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.5101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1012-Tumor-Subcutaneous_nodule</th>\n",
       "      <td>1</td>\n",
       "      <td>polyA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1013-Tumor</th>\n",
       "      <td>2</td>\n",
       "      <td>polyA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1014-Tumor</th>\n",
       "      <td>3</td>\n",
       "      <td>polyA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1015-Tumor</th>\n",
       "      <td>4</td>\n",
       "      <td>tcap</td>\n",
       "      <td>TMPRSS2--ERG</td>\n",
       "      <td>117.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.8226</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   Unnamed: 0 seq_type   #FusionName  \\\n",
       "sample                                                                 \n",
       "MO_1008-Tumor_Dura                          0     tcap  EIF4A2--ETV5   \n",
       "MO_1012-Tumor-Subcutaneous_nodule           1    polyA           NaN   \n",
       "MO_1013-Tumor                               2    polyA           NaN   \n",
       "MO_1014-Tumor                               3    polyA           NaN   \n",
       "MO_1015-Tumor                               4     tcap  TMPRSS2--ERG   \n",
       "\n",
       "                                   JunctionReadCount  SpanningFragCount  \\\n",
       "sample                                                                    \n",
       "MO_1008-Tumor_Dura                             137.0                0.0   \n",
       "MO_1012-Tumor-Subcutaneous_nodule                NaN                NaN   \n",
       "MO_1013-Tumor                                    NaN                NaN   \n",
       "MO_1014-Tumor                                    NaN                NaN   \n",
       "MO_1015-Tumor                                  117.0                0.0   \n",
       "\n",
       "                                     FFPM  \n",
       "sample                                     \n",
       "MO_1008-Tumor_Dura                 2.5101  \n",
       "MO_1012-Tumor-Subcutaneous_nodule     NaN  \n",
       "MO_1013-Tumor                         NaN  \n",
       "MO_1014-Tumor                         NaN  \n",
       "MO_1015-Tumor                      2.8226  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fusions_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(682, 6)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fusions_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "660"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(fusions_data.index.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "35"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(fusions_data['#FusionName'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "response_file = join(PROSTATE_DATA_PATH,'processed/response_paper.csv')\n",
    "response_data = pd.read_csv(response_file, index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>response</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAPC-STID0000011640-Tumor-SM-2XU1H</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPC-STID0000021561-Tumor-SM-3RVWB</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPC-STID0000011949-Tumor-SM-2XU1I</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPC-STID0000021610-Tumor-SM-2XU13</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPC-STID0000021537-Tumor-SM-3RVW7</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    response\n",
       "id                                          \n",
       "AAPC-STID0000011640-Tumor-SM-2XU1H         0\n",
       "AAPC-STID0000021561-Tumor-SM-3RVWB         0\n",
       "AAPC-STID0000011949-Tumor-SM-2XU1I         0\n",
       "AAPC-STID0000021610-Tumor-SM-2XU13         0\n",
       "AAPC-STID0000021537-Tumor-SM-3RVW7         0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1013"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(response_data.index.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping_file = join(PROSTATE_DATA_PATH, 'raw_data/sample_mapping.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping_ids = pd.read_csv(mapping_file, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th>patient</th>\n",
       "      <th>rna_sample_id</th>\n",
       "      <th>tpm_col</th>\n",
       "      <th>fusion_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-EJ-5499</td>\n",
       "      <td>PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG</td>\n",
       "      <td>PRAD-EJ-5499-TP</td>\n",
       "      <td>PRAD-EJ-5499-TP_polyA</td>\n",
       "      <td>PRAD-EJ-5499-TP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MO_1012</td>\n",
       "      <td>MO_1012-Tumor-Abdomen_wall_nodule</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule_polyA</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-CH-5752</td>\n",
       "      <td>PRAD-TCGA-CH-5752-Tumor-SM-1U3ID</td>\n",
       "      <td>PRAD-CH-5752-TP</td>\n",
       "      <td>PRAD-CH-5752-TP_polyA</td>\n",
       "      <td>PRAD-CH-5752-TP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>06-134H1_LN</td>\n",
       "      <td>06-134H1_LN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC_9126</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>SC_9126_Tumor_tcap</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Tumor_Sample_Barcode                            patient  \\\n",
       "0         TCGA-EJ-5499   PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG   \n",
       "1              MO_1012  MO_1012-Tumor-Abdomen_wall_nodule   \n",
       "2         TCGA-CH-5752   PRAD-TCGA-CH-5752-Tumor-SM-1U3ID   \n",
       "3          06-134H1_LN                        06-134H1_LN   \n",
       "4              SC_9126                      SC_9126_Tumor   \n",
       "\n",
       "                       rna_sample_id                                  tpm_col  \\\n",
       "0                    PRAD-EJ-5499-TP                    PRAD-EJ-5499-TP_polyA   \n",
       "1  MO_1012-Tumor-Subcutaneous_nodule  MO_1012-Tumor-Subcutaneous_nodule_polyA   \n",
       "2                    PRAD-CH-5752-TP                    PRAD-CH-5752-TP_polyA   \n",
       "3                                NaN                                      NaN   \n",
       "4                      SC_9126_Tumor                       SC_9126_Tumor_tcap   \n",
       "\n",
       "                         fusion_name  \n",
       "0                    PRAD-EJ-5499-TP  \n",
       "1  MO_1012-Tumor-Subcutaneous_nodule  \n",
       "2                    PRAD-CH-5752-TP  \n",
       "3                                NaN  \n",
       "4                      SC_9126_Tumor  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping_ids.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "response_data_fusion = mapping_ids.join(response_data, on='Tumor_Sample_Barcode',  how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th>patient</th>\n",
       "      <th>rna_sample_id</th>\n",
       "      <th>tpm_col</th>\n",
       "      <th>fusion_name</th>\n",
       "      <th>response</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-EJ-5499</td>\n",
       "      <td>PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG</td>\n",
       "      <td>PRAD-EJ-5499-TP</td>\n",
       "      <td>PRAD-EJ-5499-TP_polyA</td>\n",
       "      <td>PRAD-EJ-5499-TP</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MO_1012</td>\n",
       "      <td>MO_1012-Tumor-Abdomen_wall_nodule</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule_polyA</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-CH-5752</td>\n",
       "      <td>PRAD-TCGA-CH-5752-Tumor-SM-1U3ID</td>\n",
       "      <td>PRAD-CH-5752-TP</td>\n",
       "      <td>PRAD-CH-5752-TP_polyA</td>\n",
       "      <td>PRAD-CH-5752-TP</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>06-134H1_LN</td>\n",
       "      <td>06-134H1_LN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC_9126</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>SC_9126_Tumor_tcap</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Tumor_Sample_Barcode                            patient  \\\n",
       "0         TCGA-EJ-5499   PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG   \n",
       "1              MO_1012  MO_1012-Tumor-Abdomen_wall_nodule   \n",
       "2         TCGA-CH-5752   PRAD-TCGA-CH-5752-Tumor-SM-1U3ID   \n",
       "3          06-134H1_LN                        06-134H1_LN   \n",
       "4              SC_9126                      SC_9126_Tumor   \n",
       "\n",
       "                       rna_sample_id                                  tpm_col  \\\n",
       "0                    PRAD-EJ-5499-TP                    PRAD-EJ-5499-TP_polyA   \n",
       "1  MO_1012-Tumor-Subcutaneous_nodule  MO_1012-Tumor-Subcutaneous_nodule_polyA   \n",
       "2                    PRAD-CH-5752-TP                    PRAD-CH-5752-TP_polyA   \n",
       "3                                NaN                                      NaN   \n",
       "4                      SC_9126_Tumor                       SC_9126_Tumor_tcap   \n",
       "\n",
       "                         fusion_name  response  \n",
       "0                    PRAD-EJ-5499-TP         0  \n",
       "1  MO_1012-Tumor-Subcutaneous_nodule         1  \n",
       "2                    PRAD-CH-5752-TP         0  \n",
       "3                                NaN         1  \n",
       "4                      SC_9126_Tumor         1  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Only the last expression of a cell is rendered, so print the shape explicitly.\n",
    "print(response_data_fusion.shape)\n",
    "response_data_fusion.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "fusions_data_with_response = response_data_fusion.join(fusions_data, on='fusion_name', how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(681, 12)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fusions_data_with_response.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th>patient</th>\n",
       "      <th>rna_sample_id</th>\n",
       "      <th>tpm_col</th>\n",
       "      <th>fusion_name</th>\n",
       "      <th>response</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>seq_type</th>\n",
       "      <th>#FusionName</th>\n",
       "      <th>JunctionReadCount</th>\n",
       "      <th>SpanningFragCount</th>\n",
       "      <th>FFPM</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-EJ-5499</td>\n",
       "      <td>PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG</td>\n",
       "      <td>PRAD-EJ-5499-TP</td>\n",
       "      <td>PRAD-EJ-5499-TP_polyA</td>\n",
       "      <td>PRAD-EJ-5499-TP</td>\n",
       "      <td>0</td>\n",
       "      <td>119</td>\n",
       "      <td>polyA</td>\n",
       "      <td>TMPRSS2--ERG</td>\n",
       "      <td>52.0</td>\n",
       "      <td>131.0</td>\n",
       "      <td>2.3998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MO_1012</td>\n",
       "      <td>MO_1012-Tumor-Abdomen_wall_nodule</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule_polyA</td>\n",
       "      <td>MO_1012-Tumor-Subcutaneous_nodule</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>polyA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-CH-5752</td>\n",
       "      <td>PRAD-TCGA-CH-5752-Tumor-SM-1U3ID</td>\n",
       "      <td>PRAD-CH-5752-TP</td>\n",
       "      <td>PRAD-CH-5752-TP_polyA</td>\n",
       "      <td>PRAD-CH-5752-TP</td>\n",
       "      <td>0</td>\n",
       "      <td>94</td>\n",
       "      <td>polyA</td>\n",
       "      <td>TMPRSS2--ERG</td>\n",
       "      <td>69.0</td>\n",
       "      <td>156.0</td>\n",
       "      <td>2.8597</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC_9126</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>SC_9126_Tumor_tcap</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>1</td>\n",
       "      <td>642</td>\n",
       "      <td>tcap</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>PROS01448-6115227-SM-67ERU</td>\n",
       "      <td>PROS01448-6115227-Tumor-SM-67ERU</td>\n",
       "      <td>PROS01448-6115227-Tumor-SM-67ERU</td>\n",
       "      <td>PROS01448-6115227-Tumor-SM-67ERU_polyA</td>\n",
       "      <td>PROS01448-6115227-Tumor-SM-67ERU</td>\n",
       "      <td>1</td>\n",
       "      <td>551</td>\n",
       "      <td>polyA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Tumor_Sample_Barcode                            patient  \\\n",
       "0                TCGA-EJ-5499   PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG   \n",
       "1                     MO_1012  MO_1012-Tumor-Abdomen_wall_nodule   \n",
       "2                TCGA-CH-5752   PRAD-TCGA-CH-5752-Tumor-SM-1U3ID   \n",
       "4                     SC_9126                      SC_9126_Tumor   \n",
       "6  PROS01448-6115227-SM-67ERU   PROS01448-6115227-Tumor-SM-67ERU   \n",
       "\n",
       "                       rna_sample_id                                  tpm_col  \\\n",
       "0                    PRAD-EJ-5499-TP                    PRAD-EJ-5499-TP_polyA   \n",
       "1  MO_1012-Tumor-Subcutaneous_nodule  MO_1012-Tumor-Subcutaneous_nodule_polyA   \n",
       "2                    PRAD-CH-5752-TP                    PRAD-CH-5752-TP_polyA   \n",
       "4                      SC_9126_Tumor                       SC_9126_Tumor_tcap   \n",
       "6   PROS01448-6115227-Tumor-SM-67ERU   PROS01448-6115227-Tumor-SM-67ERU_polyA   \n",
       "\n",
       "                         fusion_name  response  Unnamed: 0 seq_type  \\\n",
       "0                    PRAD-EJ-5499-TP         0         119    polyA   \n",
       "1  MO_1012-Tumor-Subcutaneous_nodule         1           1    polyA   \n",
       "2                    PRAD-CH-5752-TP         0          94    polyA   \n",
       "4                      SC_9126_Tumor         1         642     tcap   \n",
       "6   PROS01448-6115227-Tumor-SM-67ERU         1         551    polyA   \n",
       "\n",
       "    #FusionName  JunctionReadCount  SpanningFragCount    FFPM  \n",
       "0  TMPRSS2--ERG               52.0              131.0  2.3998  \n",
       "1           NaN                NaN                NaN     NaN  \n",
       "2  TMPRSS2--ERG               69.0              156.0  2.8597  \n",
       "4           NaN                NaN                NaN     NaN  \n",
       "6           NaN                NaN                NaN     NaN  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fusions_data_with_response.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(681, 12)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fusions_data_with_response.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "659"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(fusions_data_with_response.Tumor_Sample_Barcode.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "360"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(fusions_data_with_response.loc[:,'#FusionName'].isna())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "response\n",
       "0    472\n",
       "1    209\n",
       "Name: fusion_name, dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fusions_data_with_response.groupby('response')['fusion_name'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    472\n",
       "1    209\n",
       "Name: response, dtype: int64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fusions_data_with_response.response.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "p1000_fusions = fusions_data_with_response[['Tumor_Sample_Barcode', '#FusionName']].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# p1000_fusions['#FusionName'].fillna(0, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the fusion name into a binary flag: 1 when a fusion was called for the\n",
    "# row, 0 when the name is missing. astype(int) gives a numeric column instead\n",
    "# of integers stored in an object column.\n",
    "# NOTE: run once per fresh copy of p1000_fusions; on a second run every value\n",
    "# is non-missing, so every row would be flagged 1.\n",
    "p1000_fusions['#FusionName'] = p1000_fusions['#FusionName'].notna().astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th>#FusionName</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-EJ-5499</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MO_1012</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-CH-5752</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC_9126</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>PROS01448-6115227-SM-67ERU</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Tumor_Sample_Barcode #FusionName\n",
       "0                TCGA-EJ-5499           1\n",
       "1                     MO_1012           0\n",
       "2                TCGA-CH-5752           1\n",
       "4                     SC_9126           0\n",
       "6  PROS01448-6115227-SM-67ERU           0"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1000_fusions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "321"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1000_fusions['#FusionName'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The column now holds a 0/1 flag rather than a fusion name, so rename it.\n",
    "p1000_fusions = p1000_fusions.rename(columns={'#FusionName': 'fusion_indicator'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th>fusion_indicator</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-EJ-5499</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MO_1012</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-CH-5752</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC_9126</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>PROS01448-6115227-SM-67ERU</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Tumor_Sample_Barcode fusion_indicator\n",
       "0                TCGA-EJ-5499                1\n",
       "1                     MO_1012                0\n",
       "2                TCGA-CH-5752                1\n",
       "4                     SC_9126                0\n",
       "6  PROS01448-6115227-SM-67ERU                0"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1000_fusions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse repeated (sample, indicator) rows so each pair appears once.\n",
    "p1000_fusions_unique = p1000_fusions.drop_duplicates()\n",
    "# Each sample must end up with exactly one row; a barcode that still appears\n",
    "# twice here would carry both indicator values.\n",
    "assert p1000_fusions_unique['Tumor_Sample_Barcode'].is_unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(659, 2)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1000_fusions_unique.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Saving fusions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# index=False: the row labels are leftovers of the joins and are not written out.\n",
    "p1000_fusions_unique.to_csv(join(PROSTATE_DATA_PATH, 'processed/p1000_onco_ets_fusions.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:min_env]",
   "language": "python",
   "name": "conda-env-min_env-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
